knitr::opts_chunk$set(echo = TRUE)
# install.packages("Hmisc")
# install.packages("pastecs")
# install.packages("ggplot2")
# install.packages("Hmisc")
# install.packages("fastDummies")
# install.packages("lmtest")
# install.packages("lmtest")
# install.packages("caretEnsemble")
# install.packages("Amelia")
# install.packages("GGally")
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(corrplot)
## corrplot 0.92 loaded
library(caret)
## Loading required package: lattice
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
library(leaps)
library(reshape2)
library(gridExtra)
library(fastDummies)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(pastecs)
library(skimr)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.6      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.0 
## ✔ readr   2.1.2      ✔ forcats 0.5.1 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()     masks ggplot2::%+%()
## ✖ psych::alpha()   masks ggplot2::alpha()
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ tidyr::expand()  masks Matrix::expand()
## ✖ tidyr::extract() masks pastecs::extract()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::first()   masks pastecs::first()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ dplyr::last()    masks pastecs::last()
## ✖ purrr::lift()    masks caret::lift()
## ✖ tidyr::pack()    masks Matrix::pack()
## ✖ tidyr::unpack()  masks Matrix::unpack()
library(caret)
library(caretEnsemble)
## 
## Attaching package: 'caretEnsemble'
## 
## The following object is masked from 'package:ggplot2':
## 
##     autoplot
library(psych)
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mice)
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(rpart)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:gridExtra':
## 
##     combine
## 
## The following object is masked from 'package:psych':
## 
##     outlier
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(nnet)
library(ROCR)
library(Metrics)
## 
## Attaching package: 'Metrics'
## 
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(caret)
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## 
## Attaching package: 'forecast'
## 
## The following object is masked from 'package:Metrics':
## 
##     accuracy
## 
## The following object is masked from 'package:caretEnsemble':
## 
##     autoplot
library(rpart)
library(rattle)
## Loading required package: bitops
## 
## Attaching package: 'bitops'
## 
## The following object is masked from 'package:Matrix':
## 
##     %&%
## 
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## 
## The following object is masked from 'package:randomForest':
## 
##     importance
library(ggplot2)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following object is masked from 'package:purrr':
## 
##     compact
library(rlist)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following object is masked from 'package:Metrics':
## 
##     auc
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(ROSE)
## Loaded ROSE 0.0-4
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# reading file
df = readr::read_csv("caravan-insurance-challenge.csv")
## Rows: 9822 Columns: 87
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): ORIGIN
## dbl (86): MOSTYPE, MAANTHUI, MGEMOMV, MGEMLEEF, MOSHOOFD, MGODRK, MGODPR, MG...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Step 1 Data Visulation and exploration

dimension = dim(df)
# total number of observations are 9822
# total number of variables are 87 

totalRows = dimension[1]

# data type of 86 variables is number and 1 variable is char, our target variable is number of policies bought remaining variables are categorical numeric.

head(df)
## # A tibble: 6 × 87
##   ORIGIN MOSTYPE MAANTHUI MGEMOMV MGEMLEEF MOSHOOFD MGODRK MGODPR MGODOV MGODGE
##   <chr>    <dbl>    <dbl>   <dbl>    <dbl>    <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 train       33        1       3        2        8      0      5      1      3
## 2 train       37        1       2        2        8      1      4      1      4
## 3 train       37        1       2        2        8      0      4      2      4
## 4 train        9        1       3        3        3      2      3      2      4
## 5 train       40        1       4        2       10      1      4      1      4
## 6 train       23        1       2        1        5      0      5      0      5
## # … with 77 more variables: MRELGE <dbl>, MRELSA <dbl>, MRELOV <dbl>,
## #   MFALLEEN <dbl>, MFGEKIND <dbl>, MFWEKIND <dbl>, MOPLHOOG <dbl>,
## #   MOPLMIDD <dbl>, MOPLLAAG <dbl>, MBERHOOG <dbl>, MBERZELF <dbl>,
## #   MBERBOER <dbl>, MBERMIDD <dbl>, MBERARBG <dbl>, MBERARBO <dbl>, MSKA <dbl>,
## #   MSKB1 <dbl>, MSKB2 <dbl>, MSKC <dbl>, MSKD <dbl>, MHHUUR <dbl>,
## #   MHKOOP <dbl>, MAUT1 <dbl>, MAUT2 <dbl>, MAUT0 <dbl>, MZFONDS <dbl>,
## #   MZPART <dbl>, MINKM30 <dbl>, MINK3045 <dbl>, MINK4575 <dbl>, …
# explore top 5 rows

Descriptive analysis

# summary of main data set
# stat.desc(df)
# summary(df)

# missing values in main data set
paste0("Total missing values:", sum(is.na(df)))
## [1] "Total missing values:0"
desc = skim(df)

Plot graphs

unInsuredRows = df[df$CARAVAN == 0,]
insuredRows = df[df$CARAVAN == 1,]

peopleNotInsuredWithCaravan  = nrow(unInsuredRows);
peopleInsuredWithCaravan = nrow(insuredRows);


ratioOfPeopleWhoDontBought = peopleNotInsuredWithCaravan/totalRows * 100
ratioOfPeopleWhoBought = peopleInsuredWithCaravan/totalRows * 100
# 5.96% who bought insurance vs 94% who didn't buy the insurance, by statistics it looks like company is in loss people are not interested in buying they have to something to improve.

dat <- data.frame(
  policy_status = factor(c("Not Insured","Insured"), levels=c("Not Insured","Insured")),
  Count = c( peopleNotInsuredWithCaravan , peopleInsuredWithCaravan)
)

ggplot(data=dat, aes(x=policy_status, y=Count, fill=policy_status)) +
  geom_bar(colour="red", stat="identity")

Customer Main Type

ggplot(df,aes(factor(df$MOSHOOFD))) + 
    geom_bar(aes(fill = factor(df$CARAVAN))) + 
    labs(x="Customer Main type") +
    scale_fill_discrete(name = "CARAVAN") + 
    ggtitle("Caravan Policy based on Customer Main Type") +
    theme(plot.title = element_text(hjust = 0.5))

df$maintype = df$MOSHOOFD

nrow(df[df$maintype == 1 & df$CARAVAN == 1,])
## [1] 75
nrow(df[df$maintype == 2 & df$CARAVAN == 1,])
## [1] 103
nrow(df[df$maintype == 3 & df$CARAVAN == 1,])
## [1] 109
nrow(df[df$maintype == 4 & df$CARAVAN == 1,])
## [1] 0
nrow(df[df$maintype == 5 & df$CARAVAN == 1,])
## [1] 18
nrow(df[df$maintype == 6 & df$CARAVAN == 1,])
## [1] 9
nrow(df[df$maintype == 7 & df$CARAVAN == 1,])
## [1] 35
nrow(df[df$maintype == 8 & df$CARAVAN == 1,])
## [1] 151
nrow(df[df$maintype == 9 & df$CARAVAN == 1,])
## [1] 75
nrow(df[df$maintype == 10 & df$CARAVAN == 1,])
## [1] 11
# Here we wanted to see the which customer main type has the highest frequency/count of buying the insurance. Based on results, we see that there are atleast 4 main customer categories that buy insurance. However, for our ease and understanding purposes we will only consider the top 2. This brings us to select, category number 8 and 3, where 8 = Family with grown ups and 3 = Driven Growths

Customer sub type

ggplot(df,aes(factor(df$MOSTYPE))) + 
    geom_bar(aes(fill = factor(df$CARAVAN))) + 
    labs(x="Customer Sub type") +
    scale_fill_discrete(name = "CARAVAN") + 
    ggtitle("Policy Bought based on Customer sub Type") +
    theme(plot.title = element_text(hjust = 0.5))

df$subtype = df$MOSTYPE
nrow(df[df$subtype == 33 & df$CARAVAN == 1,])
## [1] 80
nrow(df[df$subtype == 8 & df$CARAVAN == 1,])
## [1] 72
# category 33 and 8 purchased more policies.
# Based on our main category which compromises of various sub-categories, we can see that sub-categories number 33 and 8 are porne to buying insurance. These should be considered as the characteristics/attributes of the types of customers that exist in the main customer category. hence, we could say that, those who are middle class and those who are low class but have large families have a higher chance of getting the insurance 

Age

ggplot(df,aes(factor(df$MGEMLEEF))) + 
    geom_bar(aes(fill = factor(df$CARAVAN))) + 
    geom_text(stat='count', aes(label=..count..), vjust=0) + 
    labs(x="Age Group") +
    scale_fill_discrete(name = "CARAVAN") + 
    ggtitle("Policy bought on age group") +
    theme(plot.title = element_text(hjust = 0.5))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.

df$age = df$MGEMLEEF
nrow(df[df$age == 1 & df$CARAVAN == 1,])
## [1] 1
nrow(df[df$age == 2 & df$CARAVAN == 1,])
## [1] 156
nrow(df[df$age == 3 & df$CARAVAN == 1,])
## [1] 303
nrow(df[df$age == 4 & df$CARAVAN == 1,])
## [1] 105
nrow(df[df$age == 5 & df$CARAVAN == 1,])
## [1] 20
nrow(df[df$age == 6 & df$CARAVAN == 1,])
## [1] 1
# Here we have explored to see what is the age range of the customers that buy the insurance. Based on our analysis we see that customers who are between the ages 40-50 are prone to buying insurance compared with others. Hence, we could say that it is among the many characteristics of the main customer group [3,8]

No of houses

ggplot(df,aes(factor(df$MAANTHUI))) + 
    geom_bar(aes(fill = factor(df$CARAVAN))) + 
    geom_text(stat='count', aes(label=..count..), vjust=0) + 
    labs(x="Number of houses") +
    scale_fill_discrete(name = "CARAVAN") + 
    ggtitle("Number of houses customer has who bought insurance") +
    theme(plot.title = element_text(hjust = 0.5))

df$noofhouses = df$MAANTHUI
nrow(df[df$noofhouses == 1 & df$CARAVAN == 1,])
## [1] 526
nrow(df[df$noofhouses == 2 & df$CARAVAN == 1,])
## [1] 59
nrow(df[df$noofhouses == 3 & df$CARAVAN == 1,])
## [1] 1
# Now we wanted to see who is prone to getting an insurance with respect to number of houses and we have found that customers having at least 1 house are likely to get the insurance.

No of house hold

ggplot(df,aes(factor(df$MGEMOMV))) + 
    geom_bar(aes(fill = factor(df$CARAVAN))) + 
    geom_text(stat='count', aes(label=..count..), vjust=0) + 
    labs(x="Number of house hold") +
    scale_fill_discrete(name = "CARAVAN") + 
    ggtitle("Number of house hold") +
    theme(plot.title = element_text(hjust = 0.5))

df$noofhousehold = df$MGEMOMV
nrow(df[df$noofhousehold == 1 & df$CARAVAN == 1,])
## [1] 11
nrow(df[df$noofhousehold == 2 & df$CARAVAN == 1,])
## [1] 195
nrow(df[df$noofhousehold == 3 & df$CARAVAN == 1,])
## [1] 275
df$hasThreeHouseHold = ifelse(df$noofhousehold == 3, 1, 0)
# Now we wanted to see who is prone to getting an insurance with respect to number of houses and we have found that customers having at least 1 house are likely to get the insurance.

Charactertics we found so far

Customer having 3 house hold

Customer have one house

Age of customer is between 40 to 50

Customer are Driven Growers

Customer belongs to Lower class large families

Correlation Analysis

corrplot(cor(df[, c("subtype","maintype", "age", "noofhouses", "noofhousehold", "CARAVAN")]), method = "number")

# From the correlation matrix we have below, we have some interesting insights. The reason to run the correlation matrix was to figure out variables of interest which might cause either overfitting or underfitting. # Here we can see that the variables of interest are positively correlated. One exception to this is the correlation between age and number of households. We see that there is a negative correlation between it. Which actually makes sense because, the greater the age, the no of households will decrease. 

Which varaible to select when they are highly postive correlated?

cor(df$subtype, df$CARAVAN)
## [1] -0.06074174
cor(df$maintype, df$CARAVAN)
## [1] -0.05930648
# we will choose which have high corelation with response variable in this case both are weak correlated doesn't matter what we really choose
# we will choose maintype

Categorizing Data

Number of Houses

table(df$MAANTHUI)
## 
##    1    2    3    4    5    6    7    8   10 
## 8915  821   64    4    3    3    8    2    2
df$MAANTHUI = replace(df$MAANTHUI, df$MAANTHUI > 2, 2)
df$OneHouse = ifelse(df$MAANTHUI ==1, 1, 0)
df$moreThanTwoHouse = ifelse(df$MAANTHUI > 2, 1, 0)


# by looking frequencies of houses we categorized houses into dummies.

Grouping sub customer to meanful customer type.

df$averageFamily = ifelse(df$MOSTYPE %in% c(12,11,9,10,13), 1, 0)
df$loners = ifelse(df$MOSTYPE %in% c(17,15,18,16,19), 1, 0)
df$conservativeFamilies = ifelse(df$MOSTYPE %in% c(39,38), 1, 0)
df$crusingSeniors = ifelse(df$MOSTYPE %in% c(26,25,28,27), 1, 0)
df$drivenGrowers = ifelse(df$MOSTYPE %in% c(6,7,8), 1, 0)
df$grownups = ifelse(df$MOSTYPE %in% c(33,34,35,36,37), 1, 0)
df$framers = ifelse(df$MOSTYPE %in% c(40,41), 1, 0)
df$livingWell = ifelse(df$MOSTYPE %in% c(20,21,22,23,24), 1, 0)
df$retired = ifelse(df$MOSTYPE %in% c(29,30,31,32), 1, 0)
df$successful = ifelse(df$MOSTYPE %in% c(1,2,3,4,5), 1, 0)

dat <- data.frame(
  Categorized_Customers = factor(c("averageFamily", "loners", "conservativeFamilies", "crusingSeniors", "drivenGrowers", "grownups", "framers", "livingWell", "retired", "successful"), levels=c("averageFamily", "loners", "conservativeFamilies", "crusingSeniors", "drivenGrowers", "grownups", "framers", "livingWell", "retired", "successful")),
  Count = c( sum(df$averageFamily), sum(df$loners), sum(df$conservativeFamilies), sum(df$crusingSeniors), sum(df$drivenGrowers), sum(df$grownups), sum(df$framers), sum(df$livingWell), sum(df$retired), sum(df$successful) )
)

ggplot(data=dat, aes(x=Categorized_Customers, y=Count, fill=Categorized_Customers)) +
  geom_bar(colour="red", stat="identity")

Income Conversion

# Converting 30k income into value
df$MINKM30_c = ifelse(df$MINKM30 == 1, 0.05 * 30000, df$MINKM30)
df$MINKM30_c = ifelse(df$MINKM30_c == 2, 0.17 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 3, 0.3 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 4, 0.43 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 5, 0.56 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 6, 0.69 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 7, 0.82 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 8, 0.94 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 9, 1 * 30000, df$MINKM30_c)


# Converting 45k income into value
df$MINK3045_c = ifelse(df$MINK3045 == 1, 0.05 * 45000, df$MINK3045)
df$MINK3045_c = ifelse(df$MINK3045_c == 2, 0.17 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 3, 0.3 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 4, 0.43 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 5, 0.56 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 6, 0.69 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 7, 0.82 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 8, 0.94 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 9, 1 * 45000, df$MINK3045_c)

# Converting 70k income into value
df$MINK4575_c = ifelse(df$MINK4575 == 1, 0.05 * 75000, df$MINK4575)
df$MINK4575_c = ifelse(df$MINK4575_c == 2, 0.17 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 3, 0.3 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 4, 0.43 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 5, 0.56 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 6, 0.69 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 7, 0.82 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 8, 0.94 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 9, 1 * 75000, df$MINK4575_c)

# Converting 122k income into value
df$MINK7512_c = ifelse(df$MINK7512 == 1, 0.05 * 122000, df$MINK7512)
df$MINK7512_c = ifelse(df$MINK7512_c == 2, 0.17 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 3, 0.3 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 4, 0.43 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 5, 0.56 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 6, 0.69 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 7, 0.82 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 8, 0.94 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 9, 1 * 122000, df$MINK7512_c)

# Converting 123k income into value
df$MINK123M_c = ifelse(df$MINK123M == 1, 0.05 * 123000, df$MINK123M)
df$MINK123M_c = ifelse(df$MINK123M_c == 2, 0.17 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 3, 0.3 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 4, 0.43 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 5, 0.56 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 6, 0.69 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 7, 0.82 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 8, 0.94 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 9, 1 * 123000, df$MINK123M_c)

# Average income
df$MINKGEM_c = (df$MINK123M_c + df$MINK7512_c + df$MINK4575_c + df$MINK3045_c + df$MINKM30_c)/5
hist(df$MINKGEM_c)

Converting age into numerical.

df$MGEMLEEF_c = ifelse(df$MGEMLEEF == 1, 25, df$MGEMLEEF)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 2, 35, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 3, 45, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 4, 55, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 5, 65, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 4, 75, df$MGEMLEEF_c)
# MOSTYPE : customer subtype
# MFWEKIND: Household with children
# MOPLLAAG: Lower level education
# MHHUUR :  Rented house
# MHKOOP :  Home owners
# MINKM30:  Income < 30.000 low income
# MINK7512: Income 75-122.000 high income
# MKOOPKLA: Purchasing power class
# PPERSAUT: Contribution car policies
# CARAVAN:  Number of mobile home policies 0 - 1

Out liers

plot_ly(x = ~df$MINKM30_c, y = ~df$MOSTYPE, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
# After having converted the categorical variable for income to numerical, here we take income less than 3000 against customer sub-categories and draw a box plot. From the below box plot, we can see that as soon as income crosses 10K, we begint to see a few outliers for certain sub-categories. Another important thing to note is that when income jumps above 20K, the distance of outliers starts to increase. We could say that we are seeing extreme outliers in income levels ranging from 25K to 30K. 

# Now the question is if we should keep outliers in our analysis, whether mild or extreme or delete both of them and then proceed?
plot_ly(y = ~df$MOSTYPE, x = ~df$MINK3045_c, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
plot_ly(x = df$MINK4575_c, y = df$MOSTYPE, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed

Class Im balance problem

prop.table(table(df$CARAVAN))
## 
##          0          1 
## 0.94033802 0.05966198
# 94% people didn't buy insurance and only 5% bought. under sampling problem

barplot(prop.table(table(df$CARAVAN)), col = rainbow(2), ylim = c(0,0.7), main = "Class Distribution")

Partition data

# training data
df_train = (df[df$ORIGIN == "train",])
df_train = (df_train[,-1])
nrow(df_train)
## [1] 5822
#testing data
df_test = (df[df$ORIGIN == "test",])
df_test = (df_test[,-1])
nrow(df_test)
## [1] 4000
table(df_test$CARAVAN)
## 
##    0    1 
## 3762  238
over_train = ovun.sample(CARAVAN ~ ., data =df_train, method = "over", N =10948)$data
table(over_train$CARAVAN)
## 
##    0    1 
## 5474 5474
over_test = ovun.sample(CARAVAN ~ ., data =df_test, method = "over", N =nrow(df_test))$data
table(over_test$CARAVAN)
## 
##    0    1 
## 3762  238
# we are not fixing sampling problem in test data, i did this becuase i was having error with doing prediction

Convert train data set into factor

for(i in 1:ncol(over_train)){
over_train[,i] <- as.factor(over_train[,i])
}

Convert test data set into factor

for(i in 1:ncol(over_test)){
over_test[,i] <- as.factor(over_test[,i])
}
over_test$MINKGEM_c = as.numeric(over_test$MINKGEM_c)
over_train$MINKGEM_c = as.numeric(over_train$MINKGEM_c)

over_test$MGEMLEEF_c = as.numeric(over_test$MGEMLEEF_c)
over_train$MGEMLEEF_c = as.numeric(over_train$MGEMLEEF_c)

Draw Confusion matrix function

drewSummary = function(model) {
  summary(model)
}
drewMatrix = function(model, test_data) {
   predicted = predict(model, test_data, type = "response")
   predictedClass = ifelse(predicted>=0.5, 1, 0)
   confusionMatrix(as.factor(predictedClass), as.factor(test_data$CARAVAN), positive = "1")
}
drewAnova = function(model1, model2){
    anova(model1, model2, test = 'Chisq')
}

Model 1

set.seed(123)

logit.reg = glm(CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c+MGEMLEEF_c, data = over_train, family = binomial (link = "logit"))

logit.reg$xlevels[["MGEMOMV"]] <- union(logit.reg$xlevels[["MGEMOMV"]], levels(over_test$MGEMOMV))

drewSummary(logit.reg)
## 
## Call:
## glm(formula = CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c + 
##     MGEMLEEF_c, family = binomial(link = "logit"), data = over_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7286  -1.1534   0.3669   1.1423   1.7864  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.812e-01  1.986e-01  -1.919  0.05498 .  
## MOSHOOFD2    4.726e-01  8.534e-02   5.537 3.07e-08 ***
## MOSHOOFD3   -1.701e-01  8.132e-02  -2.092  0.03646 *  
## MOSHOOFD4   -1.468e+01  1.216e+02  -0.121  0.90385    
## MOSHOOFD5   -9.203e-01  1.039e-01  -8.859  < 2e-16 ***
## MOSHOOFD6   -1.337e+00  1.593e-01  -8.392  < 2e-16 ***
## MOSHOOFD7   -8.378e-01  9.710e-02  -8.628  < 2e-16 ***
## MOSHOOFD8   -3.136e-01  7.211e-02  -4.349 1.37e-05 ***
## MOSHOOFD9   -2.523e-01  8.708e-02  -2.897  0.00377 ** 
## MOSHOOFD10  -1.459e+00  1.391e-01 -10.486  < 2e-16 ***
## MGEMOMV2     1.713e-01  1.228e-01   1.396  0.16277    
## MGEMOMV3     1.112e-01  1.257e-01   0.885  0.37637    
## MGEMOMV4     1.609e-01  1.369e-01   1.175  0.23981    
## MGEMOMV5    -5.151e-02  2.255e-01  -0.228  0.81929    
## OneHouse1    1.364e-01  7.076e-02   1.927  0.05394 .  
## MINKGEM_c    1.701e-03  2.623e-04   6.484 8.91e-11 ***
## MGEMLEEF_c   5.077e-02  3.040e-02   1.670  0.09493 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177  on 10947  degrees of freedom
## Residual deviance: 14399  on 10931  degrees of freedom
## AIC: 14433
## 
## Number of Fisher Scoring iterations: 13
drewMatrix(logit.reg, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0  341   10
##          1 3421  228
##                                           
##                Accuracy : 0.1422          
##                  95% CI : (0.1316, 0.1535)
##     No Information Rate : 0.9405          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0063          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.95798         
##             Specificity : 0.09064         
##          Pos Pred Value : 0.06248         
##          Neg Pred Value : 0.97151         
##              Prevalence : 0.05950         
##          Detection Rate : 0.05700         
##    Detection Prevalence : 0.91225         
##       Balanced Accuracy : 0.52431         
##                                           
##        'Positive' Class : 1               
## 
# drewAnova(logit.reg)

# we did regression with/out house we see a minimal affect i.e with house model is predicting 60 true positives without it's predicting 65 and it's not significant, so we decided not to include this variable.
# difference in deviance =  Null deviance (15177) - 14398 = 870

Building model using correlation Analysis

Pre processing

train_2 = over_train

new_df  = data.frame(df_train[, -c(2,3,4,5)])
zv = apply(new_df, 2, function(x) length(unique(x)) == 1)
dfr = new_df[, !zv]
n=length(colnames(dfr))
correlationMatrix = cor(dfr[,1:n],use="complete.obs")

summary(correlationMatrix[upper.tri(correlationMatrix)])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.999554 -0.020300 -0.001239  0.005437  0.026430  1.000000
# After removing our suspected predictors we still have strong positive correlation with 1% and strong negative corelation with 99.9%, we need to find which of them are highly corelated
high = findCorrelation(correlationMatrix, cutoff = 0.75, names = TRUE)
high
##  [1] "MRELOV"        "MHKOOP"        "MZPART"        "AWALAND"      
##  [5] "APERSAUT"      "ABESAUT"       "ABROM"         "AZEILPL"      
##  [9] "AFIETS"        "AINBOED"       "subtype"       "noofhousehold"
## [13] "OneHouse"      "MINKM30_c"     "MINK3045_c"    "MINK4575_c"   
## [17] "MINKGEM_c"     "PWAPART"       "PWABEDR"       "PMOTSCO"      
## [21] "PVRAAUT"       "PAANHANG"      "PTRACTOR"      "PWERKT"       
## [25] "PLEVEN"        "PPERSONG"      "PGEZONG"       "PWAOREG"      
## [29] "PBRAND"        "PPLEZIER"      "PBYSTAND"      "MOSTYPE"      
## [33] "MINK7512"      "MINK123M"      "age"
# there are 34 variables which are correlated with each other, before dropping them we need to see how they are correlated with response variable.
target_cor_df = data.frame(CARAVAN = cor(df_train[,sort(high)], df_train[, "CARAVAN"]))


cor_df = target_cor_df[order(target_cor_df$CARAVAN,decreasing = T),,drop=F]

excludedVariables = row.names(cor_df[cor_df$CARAVAN < 0.1, ,drop=F])

excludedDummiesAndVariables = list.append(excludedVariables, 'maintype', 'age', 'noofhouses','hasThreeHouseHold', 'moreThanTwoHouse', 'averageFamily', 'loners', 'conservativeFamilies', 'crusingSeniors', 'drivenGrowers', 'grownups', 'framers','livingWell', 'retired', 'successful','MINK7512_c', 'MINK123M_c', 'MGEMLEEF_c')

excludedDummiesAndVariables
##  [1] "PWAPART"              "PBRAND"               "PPLEZIER"            
##  [4] "MHKOOP"               "MINKGEM_c"            "PBYSTAND"            
##  [7] "MINK4575_c"           "MINK7512"             "MZPART"              
## [10] "PGEZONG"              "noofhousehold"        "AFIETS"              
## [13] "PWAOREG"              "AZEILPL"              "PLEVEN"              
## [16] "AINBOED"              "PAANHANG"             "PMOTSCO"             
## [19] "age"                  "OneHouse"             "PWABEDR"             
## [22] "MINK123M"             "MINK3045_c"           "ABESAUT"             
## [25] "PPERSONG"             "PVRAAUT"              "PTRACTOR"            
## [28] "PWERKT"               "AWALAND"              "ABROM"               
## [31] "MRELOV"               "MOSTYPE"              "subtype"             
## [34] "MINKM30_c"            "maintype"             "age"                 
## [37] "noofhouses"           "hasThreeHouseHold"    "moreThanTwoHouse"    
## [40] "averageFamily"        "loners"               "conservativeFamilies"
## [43] "crusingSeniors"       "drivenGrowers"        "grownups"            
## [46] "framers"              "livingWell"           "retired"             
## [49] "successful"           "MINK7512_c"           "MINK123M_c"          
## [52] "MGEMLEEF_c"
# There are 33 variables which are less correlated with response variable and having correlation coefficient less than 0.1 so we will exclude them.
train_2 = data.frame(train_2[, !colnames(train_2) %in% excludedDummiesAndVariables])
names(train_2)
##  [1] "MAANTHUI" "MGEMOMV"  "MGEMLEEF" "MOSHOOFD" "MGODRK"   "MGODPR"  
##  [7] "MGODOV"   "MGODGE"   "MRELGE"   "MRELSA"   "MFALLEEN" "MFGEKIND"
## [13] "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG" "MBERHOOG" "MBERZELF"
## [19] "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO" "MSKA"     "MSKB1"   
## [25] "MSKB2"    "MSKC"     "MSKD"     "MHHUUR"   "MAUT1"    "MAUT2"   
## [31] "MAUT0"    "MZFONDS"  "MINKM30"  "MINK3045" "MINK4575" "MINKGEM" 
## [37] "MKOOPKLA" "PWALAND"  "PPERSAUT" "PBESAUT"  "PBROM"    "PZEILPL" 
## [43] "PFIETS"   "PINBOED"  "AWAPART"  "AWABEDR"  "APERSAUT" "AMOTSCO" 
## [49] "AVRAAUT"  "AAANHANG" "ATRACTOR" "AWERKT"   "ALEVEN"   "APERSONG"
## [55] "AGEZONG"  "AWAOREG"  "ABRAND"   "APLEZIER" "ABYSTAND" "CARAVAN"
# corrplot(cor(train_3), method = "number")
# 33 + 4 + 1 (carvan) = 38
# around 37 variables have been excluded from set so far next step would be to find good predictors which are not highly correlated each other and are significant.
# corelation between no of car policy and carvan
cor(df_train$APERSAUT, df_train$CARAVAN)
## [1] 0.1442105
# # corelation between contribution car policies and carvan
cor(df_train$PPERSAUT, df_train$CARAVAN)
## [1] 0.1509097
# # corelation between  Purchasing power class and carvan
cor(df_train$MKOOPKLA, df_train$CARAVAN)
## [1] 0.09593826
new_train_2 = train_2
for(i in 1:ncol(new_train_2)){
new_train_2[,i] <- as.numeric(new_train_2[,i])
}
cor_response = data.frame("ind_var" = colnames(new_train_2), "dep_var" = "CARAVAN", "cor_coeff" = 0, "p_values" = 0)

for (i in colnames(new_train_2)){
    cor_test <- cor.test(new_train_2[,i], new_train_2[,"CARAVAN"])
    cor_response[cor_response$ind_var == i, "correlation_coefficient"] = cor_test$estimate
    cor_response[cor_response$ind_var == i, "p_values"] = cor_test$p.value
}
cor_response[order(cor_response$cor_coeff, decreasing = T),]
##     ind_var dep_var cor_coeff      p_values correlation_coefficient
## 1  MAANTHUI CARAVAN         0  5.550372e-01            -0.005641560
## 2   MGEMOMV CARAVAN         0  5.532734e-15             0.074586377
## 3  MGEMLEEF CARAVAN         0  5.117892e-01             0.006270724
## 4  MOSHOOFD CARAVAN         0  7.003069e-51            -0.142670847
## 5    MGODRK CARAVAN         0  6.613645e-02             0.017562063
## 6    MGODPR CARAVAN         0  2.764330e-12             0.066730761
## 7    MGODOV CARAVAN         0  4.586077e-01             0.007083988
## 8    MGODGE CARAVAN         0  6.561039e-18            -0.082275276
## 9    MRELGE CARAVAN         0  3.170850e-57             0.151511020
## 10   MRELSA CARAVAN         0  2.154058e-12            -0.067063381
## 11 MFALLEEN CARAVAN         0  2.344442e-36            -0.119921519
## 12 MFGEKIND CARAVAN         0  9.431406e-03             0.024809685
## 13 MFWEKIND CARAVAN         0  1.239727e-10             0.061443906
## 14 MOPLHOOG CARAVAN         0  1.713677e-67             0.164779005
## 15 MOPLMIDD CARAVAN         0  2.876720e-21             0.090304943
## 16 MOPLLAAG CARAVAN         0  5.011368e-89            -0.189477117
## 17 MBERHOOG CARAVAN         0  2.900864e-41             0.128071750
## 18 MBERZELF CARAVAN         0  1.607065e-06             0.045831436
## 19 MBERBOER CARAVAN         0  3.925686e-37            -0.121247660
## 20 MBERMIDD CARAVAN         0  5.525889e-25             0.098434354
## 21 MBERARBG CARAVAN         0  5.567449e-22            -0.091920717
## 22 MBERARBO CARAVAN         0  1.387714e-34            -0.116835943
## 23     MSKA CARAVAN         0  2.619173e-38             0.123228971
## 24    MSKB1 CARAVAN         0  4.487239e-09             0.056017053
## 25    MSKB2 CARAVAN         0  2.234407e-01             0.011636208
## 26     MSKC CARAVAN         0  4.063955e-18            -0.082795107
## 27     MSKD CARAVAN         0  3.089016e-54            -0.147414944
## 28   MHHUUR CARAVAN         0  7.798318e-70            -0.167651829
## 29    MAUT1 CARAVAN         0  7.159098e-57             0.151032296
## 30    MAUT2 CARAVAN         0  6.951444e-02             0.017347660
## 31    MAUT0 CARAVAN         0  3.898536e-72            -0.170425025
## 32  MZFONDS CARAVAN         0  6.552560e-34            -0.115640344
## 33  MINKM30 CARAVAN         0  4.860766e-74            -0.172684444
## 34 MINK3045 CARAVAN         0  3.470248e-01            -0.008988245
## 35 MINK4575 CARAVAN         0  3.387364e-39             0.124704623
## 36  MINKGEM CARAVAN         0  1.264111e-91             0.192229883
## 37 MKOOPKLA CARAVAN         0 3.342735e-101             0.202030795
## 38  PWALAND CARAVAN         0  3.271787e-07            -0.048786091
## 39 PPERSAUT CARAVAN         0 2.366860e-300             0.343252866
## 40  PBESAUT CARAVAN         0  3.286895e-01            -0.009335966
## 41    PBROM CARAVAN         0  9.641256e-31            -0.109845665
## 42  PZEILPL CARAVAN         0  1.743296e-03             0.029918553
## 43   PFIETS CARAVAN         0  2.640270e-07             0.049171450
## 44  PINBOED CARAVAN         0  1.218064e-01             0.014788186
## 45  AWAPART CARAVAN         0  8.160869e-87             0.187099570
## 46  AWABEDR CARAVAN         0  6.241800e-01            -0.004682935
## 47 APERSAUT CARAVAN         0 7.443765e-232             0.303398100
## 48  AMOTSCO CARAVAN         0  3.372232e-04             0.034255499
## 49  AVRAAUT CARAVAN         0  6.693041e-03            -0.025915395
## 50 AAANHANG CARAVAN         0  2.025697e-03             0.029494793
## 51 ATRACTOR CARAVAN         0  1.006710e-06            -0.046717813
## 52   AWERKT CARAVAN         0  4.743499e-05            -0.038867831
## 53   ALEVEN CARAVAN         0  3.598791e-11             0.063210220
## 54 APERSONG CARAVAN         0  2.661601e-01            -0.010628079
## 55  AGEZONG CARAVAN         0  1.854777e-07             0.049799858
## 56  AWAOREG CARAVAN         0  3.548202e-03             0.027864423
## 57   ABRAND CARAVAN         0  2.786184e-48             0.138880100
## 58 APLEZIER CARAVAN         0  3.264899e-39             0.124731027
## 59 ABYSTAND CARAVAN         0  2.239881e-26             0.101311751
## 60  CARAVAN CARAVAN         0  0.000000e+00             1.000000000
corrplot(cor(subset(df_train , select = c(-CARAVAN))), method = "square", type = "upper")
## Warning in cor(subset(df_train, select = c(-CARAVAN))): the standard deviation
## is zero

# corelation between Contribution car policies and number of car policies 
cor(df_train$PPERSAUT, df_train$APERSAUT)
## [1] 0.9161545
# There is a high correlation between car policies and number of car policies we will exclude a variable which has less correlation with response variable.
# it's not always necessary to see how much it relates which response variable but it's good as it's tells us how much response variable changes for given predictor.
cor(df_train[ , c("PPERSAUT", "APERSAUT")], df_train[ , "CARAVAN"])
##            CARAVAN
## PPERSAUT 0.1509097
## APERSAUT 0.1442105
# Contribution car policies is more correlated with response variable so we exclude Number of car policies
train_2 = data.frame(train_2[ , !colnames(train_2) %in% c("APERSAUT")])
step.wise1 = glm(CARAVAN ~ ., data = train_2, family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(step.wise1)
## 
## Call:
## glm(formula = CARAVAN ~ ., family = binomial(link = "logit"), 
##     data = train_2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.9869  -0.5540   0.0177   0.7250   1.7969  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.568e+01  4.641e+03  -0.012 0.990428    
## MAANTHUI2    2.473e-02  1.395e-01   0.177 0.859273    
## MGEMOMV2     8.470e-01  4.597e-01   1.842 0.065408 .  
## MGEMOMV3     6.616e-01  4.755e-01   1.391 0.164169    
## MGEMOMV4     6.565e-01  4.962e-01   1.323 0.185748    
## MGEMOMV5     1.574e+00  6.346e-01   2.480 0.013147 *  
## MGEMLEEF2    1.058e+00  5.957e-01   1.777 0.075649 .  
## MGEMLEEF3    1.015e+00  5.996e-01   1.694 0.090345 .  
## MGEMLEEF4    1.515e+00  6.020e-01   2.517 0.011850 *  
## MGEMLEEF5    8.350e-01  6.487e-01   1.287 0.198046    
## MGEMLEEF6    1.432e+00  1.042e+00   1.374 0.169531    
## MOSHOOFD2   -3.020e-01  2.111e-01  -1.431 0.152513    
## MOSHOOFD3   -3.106e-01  1.672e-01  -1.858 0.063235 .  
## MOSHOOFD4   -2.897e+01  5.353e+02  -0.054 0.956838    
## MOSHOOFD5   -1.408e+00  3.373e-01  -4.173 3.00e-05 ***
## MOSHOOFD6   -2.846e+00  4.780e-01  -5.955 2.60e-09 ***
## MOSHOOFD7   -7.086e-01  3.405e-01  -2.081 0.037403 *  
## MOSHOOFD8    2.633e-01  1.975e-01   1.333 0.182439    
## MOSHOOFD9    7.763e-01  2.254e-01   3.444 0.000574 ***
## MOSHOOFD10  -1.729e+00  3.446e-01  -5.019 5.20e-07 ***
## MGODRK1      8.063e-01  1.182e-01   6.823 8.91e-12 ***
## MGODRK2      4.639e-01  1.627e-01   2.852 0.004345 ** 
## MGODRK3     -1.024e+00  3.467e-01  -2.953 0.003143 ** 
## MGODRK4     -2.687e+00  6.463e-01  -4.158 3.21e-05 ***
## MGODRK5     -1.974e-01  7.299e-01  -0.270 0.786796    
## MGODRK6      2.086e+00  7.870e-01   2.651 0.008031 ** 
## MGODRK7     -1.674e+01  2.371e+03  -0.007 0.994366    
## MGODRK8     -1.607e+01  3.125e+03  -0.005 0.995897    
## MGODRK9     -1.358e+01  2.389e+03  -0.006 0.995467    
## MGODPR1     -1.508e-01  6.677e-01  -0.226 0.821359    
## MGODPR2     -8.188e-02  5.986e-01  -0.137 0.891195    
## MGODPR3      4.114e-01  6.088e-01   0.676 0.499203    
## MGODPR4      4.096e-01  6.387e-01   0.641 0.521313    
## MGODPR5      3.632e-01  6.830e-01   0.532 0.594828    
## MGODPR6     -2.115e-01  7.247e-01  -0.292 0.770379    
## MGODPR7      1.079e+00  7.871e-01   1.371 0.170293    
## MGODPR8     -1.027e-01  9.048e-01  -0.114 0.909594    
## MGODPR9      3.955e-01  9.119e-01   0.434 0.664515    
## MGODOV1     -5.610e-01  1.246e-01  -4.503 6.70e-06 ***
## MGODOV2     -1.365e-01  1.722e-01  -0.793 0.428050    
## MGODOV3      5.537e-01  2.575e-01   2.150 0.031553 *  
## MGODOV4     -7.108e-01  4.190e-01  -1.696 0.089815 .  
## MGODOV5      1.467e+00  7.557e-01   1.942 0.052173 .  
## MGODGE1      1.720e-02  2.896e-01   0.059 0.952642    
## MGODGE2     -1.451e-01  2.447e-01  -0.593 0.553294    
## MGODGE3     -1.917e-02  3.023e-01  -0.063 0.949440    
## MGODGE4     -2.587e-01  3.620e-01  -0.715 0.474794    
## MGODGE5     -2.709e-01  4.429e-01  -0.612 0.540786    
## MGODGE6     -2.690e+00  6.147e-01  -4.377 1.20e-05 ***
## MGODGE7     -7.705e-01  6.759e-01  -1.140 0.254296    
## MGODGE8     -1.764e+01  2.119e+03  -0.008 0.993360    
## MGODGE9     -1.368e+01  1.433e+03  -0.010 0.992387    
## MRELGE1     -2.151e+00  1.090e+00  -1.974 0.048432 *  
## MRELGE2     -1.631e+00  9.667e-01  -1.688 0.091478 .  
## MRELGE3      1.536e-01  9.355e-01   0.164 0.869603    
## MRELGE4     -1.682e-02  9.233e-01  -0.018 0.985464    
## MRELGE5      5.717e-01  9.344e-01   0.612 0.540620    
## MRELGE6      6.061e-01  9.430e-01   0.643 0.520425    
## MRELGE7      1.084e+00  9.527e-01   1.138 0.255249    
## MRELGE8      1.745e+00  9.616e-01   1.815 0.069543 .  
## MRELGE9      1.523e+00  9.618e-01   1.584 0.113206    
## MRELSA1     -2.483e-02  1.184e-01  -0.210 0.833897    
## MRELSA2      7.637e-01  1.473e-01   5.183 2.18e-07 ***
## MRELSA3      1.945e+00  2.836e-01   6.860 6.87e-12 ***
## MRELSA4     -2.328e+00  7.104e-01  -3.277 0.001049 ** 
## MRELSA5     -1.513e+01  1.325e+03  -0.011 0.990883    
## MRELSA6     -1.439e+01  1.429e+03  -0.010 0.991965    
## MRELSA7     -1.008e+01  6.523e+03  -0.002 0.998767    
## MFALLEEN1   -5.165e-02  1.426e-01  -0.362 0.717124    
## MFALLEEN2    1.032e-02  1.960e-01   0.053 0.958001    
## MFALLEEN3   -7.899e-02  2.743e-01  -0.288 0.773360    
## MFALLEEN4   -3.249e-01  3.820e-01  -0.850 0.395091    
## MFALLEEN5   -2.565e-01  5.105e-01  -0.503 0.615303    
## MFALLEEN6    5.833e-01  6.519e-01   0.895 0.370912    
## MFALLEEN7   -1.462e+01  2.941e+02  -0.050 0.960355    
## MFALLEEN8    2.934e+00  1.056e+00   2.778 0.005464 ** 
## MFALLEEN9   -2.648e+01  7.889e+02  -0.034 0.973225    
## MFGEKIND1    1.242e+00  3.434e-01   3.616 0.000300 ***
## MFGEKIND2    5.916e-02  3.634e-01   0.163 0.870679    
## MFGEKIND3    2.730e-01  4.002e-01   0.682 0.495122    
## MFGEKIND4    2.220e-01  4.473e-01   0.496 0.619748    
## MFGEKIND5   -5.315e-01  5.237e-01  -1.015 0.310184    
## MFGEKIND6   -1.751e+00  6.049e-01  -2.894 0.003803 ** 
## MFGEKIND7   -3.181e-01  7.388e-01  -0.431 0.666827    
## MFGEKIND8    1.379e+00  1.129e+00   1.221 0.221900    
## MFGEKIND9   -4.465e+00  1.718e+00  -2.599 0.009343 ** 
## MFWEKIND1   -8.756e-01  5.301e-01  -1.652 0.098573 .  
## MFWEKIND2   -1.526e+00  5.501e-01  -2.775 0.005526 ** 
## MFWEKIND3   -1.813e+00  5.893e-01  -3.076 0.002095 ** 
## MFWEKIND4   -1.955e+00  6.432e-01  -3.039 0.002371 ** 
## MFWEKIND5   -2.568e+00  6.810e-01  -3.771 0.000162 ***
## MFWEKIND6   -1.922e+00  7.538e-01  -2.550 0.010773 *  
## MFWEKIND7   -2.188e+00  8.241e-01  -2.655 0.007939 ** 
## MFWEKIND8   -2.006e+00  9.120e-01  -2.200 0.027802 *  
## MFWEKIND9   -1.604e+00  9.763e-01  -1.643 0.100350    
## MOPLHOOG1    5.062e-01  1.367e-01   3.704 0.000212 ***
## MOPLHOOG2   -4.161e-01  1.863e-01  -2.234 0.025504 *  
## MOPLHOOG3   -1.306e+00  2.850e-01  -4.583 4.59e-06 ***
## MOPLHOOG4   -1.070e+00  3.720e-01  -2.875 0.004039 ** 
## MOPLHOOG5   -1.403e+00  4.760e-01  -2.948 0.003196 ** 
## MOPLHOOG6   -3.955e+00  6.746e-01  -5.863 4.54e-09 ***
## MOPLHOOG7   -3.750e+00  8.154e-01  -4.599 4.25e-06 ***
## MOPLHOOG8   -4.574e+00  9.221e-01  -4.960 7.05e-07 ***
## MOPLHOOG9   -3.814e+00  1.260e+00  -3.027 0.002471 ** 
## MOPLMIDD1   -7.079e-01  4.222e-01  -1.677 0.093564 .  
## MOPLMIDD2   -1.294e+00  3.923e-01  -3.299 0.000970 ***
## MOPLMIDD3   -2.323e+00  4.355e-01  -5.334 9.58e-08 ***
## MOPLMIDD4   -2.950e+00  4.825e-01  -6.114 9.73e-10 ***
## MOPLMIDD5   -3.033e+00  5.496e-01  -5.518 3.43e-08 ***
## MOPLMIDD6   -3.682e+00  6.424e-01  -5.732 9.92e-09 ***
## MOPLMIDD7   -3.310e+00  7.574e-01  -4.370 1.24e-05 ***
## MOPLMIDD8   -4.253e+00  9.123e-01  -4.661 3.14e-06 ***
## MOPLMIDD9   -4.700e+00  1.007e+00  -4.669 3.03e-06 ***
## MOPLLAAG1   -8.815e-01  4.153e-01  -2.123 0.033775 *  
## MOPLLAAG2   -4.446e-01  4.053e-01  -1.097 0.272636    
## MOPLLAAG3   -1.086e+00  4.438e-01  -2.447 0.014425 *  
## MOPLLAAG4   -2.306e+00  5.004e-01  -4.610 4.04e-06 ***
## MOPLLAAG5   -2.962e+00  5.771e-01  -5.133 2.85e-07 ***
## MOPLLAAG6   -3.244e+00  6.455e-01  -5.025 5.02e-07 ***
## MOPLLAAG7   -4.090e+00  7.409e-01  -5.520 3.39e-08 ***
## MOPLLAAG8   -4.308e+00  8.632e-01  -4.991 6.00e-07 ***
## MOPLLAAG9   -6.877e+00  9.624e-01  -7.145 9.00e-13 ***
## MBERHOOG1   -1.950e-01  1.851e-01  -1.054 0.292068    
## MBERHOOG2    6.960e-01  1.984e-01   3.508 0.000451 ***
## MBERHOOG3    5.062e-01  2.439e-01   2.075 0.037988 *  
## MBERHOOG4    1.336e+00  3.050e-01   4.380 1.19e-05 ***
## MBERHOOG5    9.132e-01  4.108e-01   2.223 0.026225 *  
## MBERHOOG6    3.161e+00  4.898e-01   6.454 1.09e-10 ***
## MBERHOOG7    3.574e+00  6.346e-01   5.632 1.78e-08 ***
## MBERHOOG8    3.077e+01  3.420e+02   0.090 0.928304    
## MBERHOOG9    3.430e+00  1.063e+00   3.227 0.001252 ** 
## MBERZELF1    7.318e-01  1.222e-01   5.988 2.12e-09 ***
## MBERZELF2    6.734e-01  1.845e-01   3.649 0.000263 ***
## MBERZELF3    3.612e-01  4.992e-01   0.724 0.469347    
## MBERZELF4    3.021e+00  8.841e-01   3.417 0.000634 ***
## MBERZELF5    2.364e+00  1.008e+00   2.344 0.019055 *  
## MBERBOER1    8.081e-01  1.285e-01   6.287 3.24e-10 ***
## MBERBOER2    1.293e+00  2.028e-01   6.378 1.80e-10 ***
## MBERBOER3    3.050e+00  3.592e-01   8.491  < 2e-16 ***
## MBERBOER4    7.835e-01  6.137e-01   1.277 0.201705    
## MBERBOER5    4.697e+00  5.957e-01   7.885 3.16e-15 ***
## MBERBOER6   -1.386e+01  1.486e+03  -0.009 0.992560    
## MBERBOER7   -2.865e+01  2.699e+03  -0.011 0.991532    
## MBERBOER8    1.651e+01  2.595e+03   0.006 0.994923    
## MBERBOER9   -7.171e+00  3.151e+03  -0.002 0.998184    
## MBERMIDD1   -8.425e-01  2.748e-01  -3.066 0.002168 ** 
## MBERMIDD2    3.484e-01  2.321e-01   1.501 0.133341    
## MBERMIDD3    5.156e-01  2.716e-01   1.898 0.057657 .  
## MBERMIDD4    1.976e+00  3.205e-01   6.167 6.95e-10 ***
## MBERMIDD5    2.053e+00  3.824e-01   5.369 7.93e-08 ***
## MBERMIDD6    3.205e+00  4.621e-01   6.936 4.04e-12 ***
## MBERMIDD7    3.785e+00  5.680e-01   6.664 2.67e-11 ***
## MBERMIDD8   -1.901e+01  1.296e+03  -0.015 0.988299    
## MBERMIDD9    5.808e+00  7.250e-01   8.011 1.13e-15 ***
## MBERARBG1    1.322e+00  1.614e-01   8.190 2.60e-16 ***
## MBERARBG2    1.207e+00  1.827e-01   6.602 4.04e-11 ***
## MBERARBG3    1.239e+00  2.354e-01   5.263 1.42e-07 ***
## MBERARBG4    1.779e+00  2.960e-01   6.010 1.86e-09 ***
## MBERARBG5    2.992e+00  3.763e-01   7.951 1.86e-15 ***
## MBERARBG6    1.010e+00  4.861e-01   2.078 0.037672 *  
## MBERARBG7    5.884e+00  6.486e-01   9.072  < 2e-16 ***
## MBERARBG8    4.767e+00  6.977e-01   6.833 8.30e-12 ***
## MBERARBG9    5.708e+00  9.441e-01   6.046 1.48e-09 ***
## MBERARBO1    2.737e-03  1.638e-01   0.017 0.986669    
## MBERARBO2    4.957e-01  1.810e-01   2.739 0.006169 ** 
## MBERARBO3    1.014e+00  2.251e-01   4.505 6.64e-06 ***
## MBERARBO4    2.073e+00  2.825e-01   7.338 2.17e-13 ***
## MBERARBO5    2.918e+00  3.832e-01   7.614 2.65e-14 ***
## MBERARBO6    2.689e+00  4.789e-01   5.614 1.98e-08 ***
## MBERARBO7    4.024e+00  6.718e-01   5.990 2.10e-09 ***
## MBERARBO8    7.196e+00  1.051e+00   6.849 7.41e-12 ***
## MBERARBO9   -1.225e+01  1.017e+03  -0.012 0.990385    
## MSKA1       -2.653e-01  1.885e-01  -1.407 0.159341    
## MSKA2       -4.119e-01  2.124e-01  -1.939 0.052491 .  
## MSKA3        6.776e-02  2.746e-01   0.247 0.805128    
## MSKA4        8.485e-01  3.677e-01   2.308 0.021018 *  
## MSKA5        5.817e-02  4.629e-01   0.126 0.900002    
## MSKA6        2.018e+00  5.871e-01   3.436 0.000589 ***
## MSKA7        1.045e+00  7.049e-01   1.482 0.138263    
## MSKA8       -2.375e+01  3.420e+02  -0.069 0.944637    
## MSKA9       -1.781e+01  8.603e+02  -0.021 0.983485    
## MSKB11       5.226e-01  1.592e-01   3.282 0.001030 ** 
## MSKB12      -2.192e-01  1.850e-01  -1.185 0.236063    
## MSKB13       1.912e-02  2.489e-01   0.077 0.938787    
## MSKB14      -1.077e+00  3.449e-01  -3.123 0.001788 ** 
## MSKB15       2.055e-02  5.185e-01   0.040 0.968396    
## MSKB16       9.098e-01  7.342e-01   1.239 0.215247    
## MSKB17      -1.631e+01  2.309e+03  -0.007 0.994363    
## MSKB18       1.813e+00  1.025e+00   1.769 0.076933 .  
## MSKB19      -1.774e+01  1.191e+03  -0.015 0.988115    
## MSKB21      -7.709e-01  1.849e-01  -4.169 3.06e-05 ***
## MSKB22      -4.214e-01  1.995e-01  -2.112 0.034677 *  
## MSKB23      -8.949e-01  2.555e-01  -3.502 0.000461 ***
## MSKB24      -4.427e-01  3.263e-01  -1.357 0.174865    
## MSKB25      -8.778e-01  4.046e-01  -2.170 0.030038 *  
## MSKB26      -1.140e+00  5.896e-01  -1.934 0.053139 .  
## MSKB27      -1.903e+01  1.699e+03  -0.011 0.991064    
## MSKB28      -4.674e+01  1.933e+03  -0.024 0.980707    
## MSKB29      -1.718e+01  4.078e+03  -0.004 0.996638    
## MSKC1        1.160e+00  3.826e-01   3.031 0.002437 ** 
## MSKC2        8.299e-01  3.621e-01   2.292 0.021903 *  
## MSKC3        8.902e-01  3.869e-01   2.301 0.021396 *  
## MSKC4        1.042e+00  4.283e-01   2.433 0.014956 *  
## MSKC5        6.059e-01  4.807e-01   1.261 0.207485    
## MSKC6        1.055e+00  5.451e-01   1.936 0.052883 .  
## MSKC7        1.425e+00  6.419e-01   2.220 0.026427 *  
## MSKC8        2.517e+00  7.123e-01   3.533 0.000411 ***
## MSKC9        3.336e-01  7.923e-01   0.421 0.673677    
## MSKD1       -2.832e-01  1.192e-01  -2.376 0.017499 *  
## MSKD2       -9.825e-02  1.681e-01  -0.585 0.558774    
## MSKD3       -1.318e+00  2.645e-01  -4.983 6.27e-07 ***
## MSKD4       -1.431e+00  4.085e-01  -3.503 0.000461 ***
## MSKD5       -2.717e+00  8.370e-01  -3.246 0.001171 ** 
## MSKD6       -7.333e-01  8.536e-01  -0.859 0.390317    
## MSKD7       -2.114e+00  1.237e+00  -1.709 0.087465 .  
## MSKD9       -1.423e+01  6.523e+03  -0.002 0.998259    
## MHHUUR1     -1.678e-01  1.824e-01  -0.920 0.357684    
## MHHUUR2     -7.118e-01  1.698e-01  -4.193 2.76e-05 ***
## MHHUUR3      8.266e-02  1.752e-01   0.472 0.637065    
## MHHUUR4     -3.037e-01  1.804e-01  -1.684 0.092199 .  
## MHHUUR5     -2.926e-01  1.922e-01  -1.522 0.128004    
## MHHUUR6     -2.424e-01  1.949e-01  -1.244 0.213547    
## MHHUUR7     -6.432e-01  2.114e-01  -3.043 0.002343 ** 
## MHHUUR8     -3.609e-01  2.251e-01  -1.603 0.108826    
## MHHUUR9      4.163e-01  1.955e-01   2.129 0.033248 *  
## MAUT11      -3.033e+01  7.956e+03  -0.004 0.996958    
## MAUT12      -8.668e+00  4.556e+03  -0.002 0.998482    
## MAUT13      -1.367e+01  4.556e+03  -0.003 0.997607    
## MAUT14       1.160e+01  4.544e+03   0.003 0.997963    
## MAUT15       1.187e+01  4.544e+03   0.003 0.997916    
## MAUT16       1.153e+01  4.544e+03   0.003 0.997975    
## MAUT17       1.137e+01  4.544e+03   0.003 0.998004    
## MAUT18       1.117e+01  4.544e+03   0.002 0.998038    
## MAUT19       9.651e+00  4.544e+03   0.002 0.998305    
## MAUT21       1.801e-01  1.464e-01   1.230 0.218575    
## MAUT22      -6.341e-01  2.034e-01  -3.118 0.001820 ** 
## MAUT23      -1.297e+00  3.301e-01  -3.930 8.50e-05 ***
## MAUT24      -1.086e+00  4.377e-01  -2.481 0.013110 *  
## MAUT25      -3.359e+00  7.341e-01  -4.576 4.75e-06 ***
## MAUT26       2.180e+01  3.420e+02   0.064 0.949162    
## MAUT27       4.572e+01  6.617e+03   0.007 0.994487    
## MAUT01      -6.577e-01  1.815e-01  -3.625 0.000289 ***
## MAUT02      -1.984e-01  2.131e-01  -0.931 0.351934    
## MAUT03       1.007e-01  3.190e-01   0.316 0.752140    
## MAUT04      -2.284e+00  4.361e-01  -5.237 1.63e-07 ***
## MAUT05      -7.763e-01  6.056e-01  -1.282 0.199855    
## MAUT06       2.503e+01  3.420e+02   0.073 0.941663    
## MAUT07       1.203e+00  9.367e+02   0.001 0.998976    
## MAUT08       2.766e+01  6.649e+03   0.004 0.996680    
## MAUT09      -2.477e+01  4.647e+03  -0.005 0.995748    
## MZFONDS1    -2.224e+01  1.602e+03  -0.014 0.988923    
## MZFONDS2    -6.531e-01  4.144e-01  -1.576 0.114995    
## MZFONDS3    -6.428e-01  4.312e-01  -1.491 0.136008    
## MZFONDS4    -6.913e-01  4.151e-01  -1.665 0.095828 .  
## MZFONDS5    -6.088e-01  4.204e-01  -1.448 0.147589    
## MZFONDS6    -1.367e+00  4.263e-01  -3.206 0.001347 ** 
## MZFONDS7    -7.084e-01  4.267e-01  -1.660 0.096840 .  
## MZFONDS8    -2.359e-01  4.415e-01  -0.534 0.593173    
## MZFONDS9    -5.751e-01  4.441e-01  -1.295 0.195366    
## MINKM301    -9.116e-02  1.607e-01  -0.567 0.570516    
## MINKM302     2.801e-01  1.706e-01   1.642 0.100682    
## MINKM303    -9.788e-01  2.462e-01  -3.976 7.02e-05 ***
## MINKM304    -1.503e+00  3.169e-01  -4.743 2.10e-06 ***
## MINKM305    -9.943e-01  3.862e-01  -2.575 0.010034 *  
## MINKM306    -2.087e-02  4.928e-01  -0.042 0.966217    
## MINKM307    -1.517e+00  6.087e-01  -2.492 0.012698 *  
## MINKM308    -3.477e+00  8.239e-01  -4.220 2.44e-05 ***
## MINKM309    -3.141e+01  5.668e+02  -0.055 0.955812    
## MINK30451   -4.754e-01  3.010e-01  -1.579 0.114242    
## MINK30452   -9.798e-01  2.499e-01  -3.921 8.83e-05 ***
## MINK30453   -1.207e+00  2.698e-01  -4.473 7.71e-06 ***
## MINK30454   -1.463e+00  3.127e-01  -4.677 2.91e-06 ***
## MINK30455   -1.891e+00  3.711e-01  -5.095 3.49e-07 ***
## MINK30456   -2.379e+00  4.381e-01  -5.429 5.65e-08 ***
## MINK30457   -2.220e+00  5.026e-01  -4.417 1.00e-05 ***
## MINK30458   -1.624e+00  6.895e-01  -2.356 0.018485 *  
## MINK30459   -8.544e-01  6.765e-01  -1.263 0.206607    
## MINK45751   -2.163e-01  2.074e-01  -1.043 0.296922    
## MINK45752   -3.100e-01  1.974e-01  -1.570 0.116395    
## MINK45753   -2.052e-01  2.155e-01  -0.953 0.340839    
## MINK45754   -7.950e-01  2.563e-01  -3.101 0.001927 ** 
## MINK45755   -7.896e-01  3.110e-01  -2.539 0.011131 *  
## MINK45756   -6.813e-01  3.871e-01  -1.760 0.078426 .  
## MINK45757   -1.466e+00  4.825e-01  -3.038 0.002379 ** 
## MINK45758   -3.455e+00  6.277e-01  -5.505 3.69e-08 ***
## MINK45759   -3.124e+00  6.659e-01  -4.691 2.72e-06 ***
## MINKGEM1     6.468e+01  9.906e+02   0.065 0.947936    
## MINKGEM2     4.620e+01  9.460e+02   0.049 0.961043    
## MINKGEM3     4.638e+01  9.460e+02   0.049 0.960896    
## MINKGEM4     4.669e+01  9.460e+02   0.049 0.960638    
## MINKGEM5     4.645e+01  9.460e+02   0.049 0.960833    
## MINKGEM6     4.519e+01  9.460e+02   0.048 0.961897    
## MINKGEM7     4.711e+01  9.460e+02   0.050 0.960283    
## MINKGEM8     4.490e+01  9.460e+02   0.047 0.962142    
## MINKGEM9     2.440e+01  1.375e+03   0.018 0.985840    
## MKOOPKLA2   -3.734e-02  2.925e-01  -0.128 0.898440    
## MKOOPKLA3   -6.823e-01  3.087e-01  -2.210 0.027071 *  
## MKOOPKLA4   -4.512e-01  3.423e-01  -1.318 0.187543    
## MKOOPKLA5   -1.347e+00  3.536e-01  -3.808 0.000140 ***
## MKOOPKLA6   -6.537e-01  3.522e-01  -1.856 0.063416 .  
## MKOOPKLA7    1.767e-02  3.907e-01   0.045 0.963936    
## MKOOPKLA8   -8.979e-01  3.886e-01  -2.311 0.020858 *  
## PWALAND2    -1.782e+01  3.118e+03  -0.006 0.995439    
## PWALAND3    -1.075e+00  3.978e-01  -2.702 0.006895 ** 
## PWALAND4    -1.858e+00  5.201e-01  -3.573 0.000353 ***
## PPERSAUT4   -1.978e+01  6.523e+03  -0.003 0.997581    
## PPERSAUT5    2.756e-02  1.253e-01   0.220 0.825909    
## PPERSAUT6    1.988e+00  7.110e-02  27.961  < 2e-16 ***
## PPERSAUT7   -1.741e+01  8.026e+02  -0.022 0.982698    
## PPERSAUT8   -1.973e+01  3.859e+03  -0.005 0.995921    
## PBESAUT5    -1.874e+01  1.534e+03  -0.012 0.990258    
## PBESAUT6     4.183e-01  3.944e-01   1.061 0.288888    
## PBESAUT7    -1.439e+01  3.953e+03  -0.004 0.997096    
## PBROM2      -6.239e-01  4.616e-01  -1.352 0.176535    
## PBROM3      -7.136e-01  1.835e-01  -3.890 0.000100 ***
## PBROM4      -1.661e+01  6.754e+02  -0.025 0.980380    
## PBROM5      -1.071e+00  6.897e-01  -1.553 0.120536    
## PBROM6      -1.368e+01  6.523e+03  -0.002 0.998327    
## PZEILPL1     3.671e-01  1.312e+00   0.280 0.779590    
## PZEILPL3    -1.622e+01  6.523e+03  -0.002 0.998016    
## PFIETS1      1.066e+00  1.949e-01   5.470 4.51e-08 ***
## PINBOED1     2.245e-01  4.374e-01   0.513 0.607809    
## PINBOED2    -1.857e-01  4.984e-01  -0.373 0.709489    
## PINBOED3    -1.922e+01  2.324e+03  -0.008 0.993402    
## PINBOED4    -1.860e+01  2.980e+03  -0.006 0.995021    
## PINBOED5    -2.467e+00  6.572e+03   0.000 0.999700    
## PINBOED6    -1.544e+01  6.523e+03  -0.002 0.998111    
## AWAPART1     5.010e-01  8.200e-02   6.111 9.93e-10 ***
## AWAPART2    -1.789e+01  2.028e+03  -0.009 0.992961    
## AWABEDR1     1.588e-01  3.349e-01   0.474 0.635276    
## AWABEDR5    -1.896e+01  6.523e+03  -0.003 0.997681    
## AMOTSCO1     1.439e-01  1.529e-01   0.942 0.346389    
## AMOTSCO2     3.647e-01  5.149e-01   0.708 0.478683    
## AMOTSCO8    -1.857e+01  6.523e+03  -0.003 0.997728    
## AVRAAUT1    -1.853e+01  2.302e+03  -0.008 0.993577    
## AVRAAUT2    -1.661e+01  3.985e+03  -0.004 0.996674    
## AVRAAUT3     3.145e+00  6.572e+03   0.000 0.999618    
## AAANHANG1    1.280e+00  3.270e-01   3.914 9.08e-05 ***
## AAANHANG2   -1.618e+01  3.185e+03  -0.005 0.995947    
## AAANHANG3   -1.724e+01  4.063e+03  -0.004 0.996616    
## ATRACTOR1   -1.635e-01  2.876e-01  -0.569 0.569688    
## ATRACTOR2    5.434e-01  6.091e-01   0.892 0.372300    
## ATRACTOR3   -1.584e+01  2.947e+03  -0.005 0.995711    
## ATRACTOR4   -1.317e+01  2.101e+03  -0.006 0.994998    
## AWERKT1     -1.527e+01  1.136e+03  -0.013 0.989275    
## AWERKT2     -1.333e+01  2.026e+03  -0.007 0.994753    
## AWERKT3     -1.344e+01  4.264e+03  -0.003 0.997485    
## AWERKT6      2.825e+01  8.304e+03   0.003 0.997286    
## ALEVEN1     -1.206e+00  2.001e-01  -6.029 1.65e-09 ***
## ALEVEN2     -3.808e-01  2.092e-01  -1.820 0.068781 .  
## ALEVEN3      2.846e-01  4.755e-01   0.599 0.549491    
## ALEVEN4      2.064e+00  6.322e-01   3.264 0.001097 ** 
## ALEVEN8      3.292e+01  7.600e+03   0.004 0.996544    
## APERSONG1   -3.023e+00  7.167e-01  -4.218 2.46e-05 ***
## AGEZONG1     6.306e-01  3.266e-01   1.931 0.053531 .  
## AWAOREG1     3.122e+00  4.513e-01   6.918 4.58e-12 ***
## AWAOREG2    -1.712e+01  2.480e+03  -0.007 0.994492    
## ABRAND1      4.405e-01  8.366e-02   5.266 1.40e-07 ***
## ABRAND2     -6.604e-02  2.253e-01  -0.293 0.769453    
## ABRAND3     -1.969e+01  1.777e+03  -0.011 0.991160    
## ABRAND4     -1.690e+01  2.754e+03  -0.006 0.995105    
## ABRAND5     -9.792e+00  3.361e+03  -0.003 0.997676    
## ABRAND7     -1.581e+01  6.523e+03  -0.002 0.998067    
## APLEZIER1    4.965e+00  4.010e-01  12.380  < 2e-16 ***
## APLEZIER2    1.203e+00  1.648e+00   0.730 0.465559    
## ABYSTAND1    9.914e-01  2.105e-01   4.710 2.48e-06 ***
## ABYSTAND2   -1.709e+01  6.523e+03  -0.003 0.997909    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177.2  on 10947  degrees of freedom
## Residual deviance:  9114.4  on 10581  degrees of freedom
## AIC: 9848.4
## 
## Number of Fisher Scoring iterations: 17
#step(step.wise1, direction = "forward")
# MAANTHUI + MGEMOMV + MGEMLEEF + MOSHOOFD + 
#     MGODRK + MGODPR + MGODOV + MGODGE + MRELGE + MRELSA + MFALLEEN + 
#     MFGEKIND + MFWEKIND + MOPLHOOG + MOPLMIDD + MOPLLAAG + MBERHOOG + 
#     MBERZELF + MBERBOER + MBERMIDD + MBERARBG + MBERARBO + MSKA + 
#     MSKB1 + MSKB2 + MSKC + MHHUUR + MAUT1 + MAUT2 + MAUT0 + 
#     MZFONDS + MINKM30 + MINK3045 + MINK4575 + MINKGEM + MKOOPKLA + 
#     PWALAND + PPERSAUT + PBESAUT + PBROM + PZEILPL + PFIETS + 
#     PINBOED + AWAPART + AWABEDR + AMOTSCO + AVRAAUT + AAANHANG + 
#     ATRACTOR + AWERKT + ALEVEN + APERSONG + AGEZONG + AWAOREG + 
#     ABRAND + APLEZIER + ABYSTAND

# List of variables suggested by stepwise foward by most to least siginificat we will pick first few and run the model so we avoid over fitting model

Model 2

model.2 = glm(formula = CARAVAN ~ MAANTHUI + MGEMOMV + MGEMLEEF + MOSHOOFD + 
    MGODRK + MGODPR + MGODOV + MGODGE + MRELGE + MRELSA + MFWEKIND + MOPLMIDD, family = binomial(link = "logit"), data = train_2)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model.2$xlevels[["MGEMOMV"]] <- union(model.2$xlevels[["MGEMOMV"]], levels(over_test$MGEMOMV))

drewSummary(model.2)
## 
## Call:
## glm(formula = CARAVAN ~ MAANTHUI + MGEMOMV + MGEMLEEF + MOSHOOFD + 
##     MGODRK + MGODPR + MGODOV + MGODGE + MRELGE + MRELSA + MFWEKIND + 
##     MOPLMIDD, family = binomial(link = "logit"), data = train_2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1110  -1.0969   0.2797   1.0680   1.8113  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.681e+00  6.338e-01  -2.652 0.007993 ** 
## MAANTHUI2   -1.929e-01  7.869e-02  -2.451 0.014250 *  
## MGEMOMV2    -5.269e-03  1.768e-01  -0.030 0.976223    
## MGEMOMV3     2.318e-02  1.927e-01   0.120 0.904284    
## MGEMOMV4    -6.654e-02  2.095e-01  -0.318 0.750812    
## MGEMOMV5    -3.993e-01  2.906e-01  -1.374 0.169308    
## MGEMLEEF2   -1.655e-01  3.071e-01  -0.539 0.589946    
## MGEMLEEF3   -1.019e-01  3.071e-01  -0.332 0.739945    
## MGEMLEEF4   -1.067e-01  3.105e-01  -0.344 0.731152    
## MGEMLEEF5    1.697e-01  3.300e-01   0.514 0.607106    
## MGEMLEEF6    1.330e+00  4.723e-01   2.816 0.004858 ** 
## MOSHOOFD2    4.219e-01  9.172e-02   4.600 4.22e-06 ***
## MOSHOOFD3   -1.007e-01  8.831e-02  -1.141 0.254047    
## MOSHOOFD4   -1.547e+01  1.795e+02  -0.086 0.931330    
## MOSHOOFD5   -7.519e-01  1.179e-01  -6.378 1.79e-10 ***
## MOSHOOFD6   -1.590e+00  1.850e-01  -8.595  < 2e-16 ***
## MOSHOOFD7   -9.481e-01  1.035e-01  -9.163  < 2e-16 ***
## MOSHOOFD8   -3.360e-01  7.705e-02  -4.361 1.30e-05 ***
## MOSHOOFD9   -1.804e-01  9.277e-02  -1.944 0.051843 .  
## MOSHOOFD10  -1.579e+00  1.481e-01 -10.662  < 2e-16 ***
## MGODRK1      3.504e-01  6.052e-02   5.790 7.04e-09 ***
## MGODRK2      2.541e-01  8.933e-02   2.844 0.004450 ** 
## MGODRK3     -5.153e-01  1.743e-01  -2.957 0.003111 ** 
## MGODRK4     -1.683e+00  3.799e-01  -4.430 9.40e-06 ***
## MGODRK5     -5.342e-01  4.487e-01  -1.191 0.233842    
## MGODRK6      1.221e+00  4.558e-01   2.678 0.007401 ** 
## MGODRK7     -1.514e+01  5.899e+02  -0.026 0.979525    
## MGODRK8     -1.579e+01  7.871e+02  -0.020 0.983998    
## MGODRK9     -1.425e+01  6.072e+02  -0.023 0.981275    
## MGODPR1      1.451e+00  3.803e-01   3.817 0.000135 ***
## MGODPR2      9.502e-01  3.332e-01   2.852 0.004347 ** 
## MGODPR3      9.822e-01  3.480e-01   2.822 0.004770 ** 
## MGODPR4      8.948e-01  3.584e-01   2.497 0.012532 *  
## MGODPR5      1.005e+00  3.826e-01   2.626 0.008634 ** 
## MGODPR6      8.127e-01  4.088e-01   1.988 0.046796 *  
## MGODPR7      1.389e+00  4.416e-01   3.145 0.001663 ** 
## MGODPR8      6.150e-01  5.217e-01   1.179 0.238427    
## MGODPR9      5.977e-01  5.201e-01   1.149 0.250512    
## MGODOV1     -3.755e-01  6.721e-02  -5.587 2.31e-08 ***
## MGODOV2     -3.380e-02  9.372e-02  -0.361 0.718357    
## MGODOV3      4.612e-01  1.553e-01   2.969 0.002989 ** 
## MGODOV4     -3.277e-01  2.231e-01  -1.469 0.141913    
## MGODOV5     -1.279e+00  4.578e-01  -2.794 0.005198 ** 
## MGODGE1      2.625e-01  1.534e-01   1.711 0.087026 .  
## MGODGE2      1.885e-01  1.341e-01   1.405 0.159903    
## MGODGE3      9.415e-02  1.721e-01   0.547 0.584340    
## MGODGE4     -5.907e-02  2.025e-01  -0.292 0.770458    
## MGODGE5     -3.207e-02  2.429e-01  -0.132 0.894967    
## MGODGE6     -1.355e+00  3.371e-01  -4.020 5.82e-05 ***
## MGODGE7      5.306e-01  3.726e-01   1.424 0.154418    
## MGODGE8     -1.497e+01  5.933e+02  -0.025 0.979865    
## MGODGE9     -1.403e+01  4.740e+02  -0.030 0.976388    
## MRELGE1      1.954e-01  3.977e-01   0.491 0.623205    
## MRELGE2     -4.963e-01  3.596e-01  -1.380 0.167513    
## MRELGE3     -1.325e-02  3.328e-01  -0.040 0.968229    
## MRELGE4     -1.709e-01  3.286e-01  -0.520 0.603059    
## MRELGE5      1.457e-01  3.152e-01   0.462 0.643918    
## MRELGE6      4.258e-01  3.183e-01   1.338 0.181002    
## MRELGE7      3.888e-01  3.167e-01   1.228 0.219566    
## MRELGE8      5.922e-01  3.286e-01   1.802 0.071484 .  
## MRELGE9      5.245e-01  3.191e-01   1.644 0.100225    
## MRELSA1     -1.707e-02  6.328e-02  -0.270 0.787386    
## MRELSA2      4.414e-02  7.443e-02   0.593 0.553207    
## MRELSA3      5.743e-01  1.613e-01   3.560 0.000371 ***
## MRELSA4     -1.210e+00  3.311e-01  -3.654 0.000258 ***
## MRELSA5     -1.468e+01  3.321e+02  -0.044 0.964739    
## MRELSA6     -1.489e+01  3.898e+02  -0.038 0.969517    
## MRELSA7     -1.574e+01  1.455e+03  -0.011 0.991370    
## MFWEKIND1    3.714e-01  2.211e-01   1.680 0.092903 .  
## MFWEKIND2    6.420e-01  2.125e-01   3.021 0.002521 ** 
## MFWEKIND3    4.613e-01  2.150e-01   2.145 0.031942 *  
## MFWEKIND4    2.995e-01  2.222e-01   1.348 0.177807    
## MFWEKIND5    9.043e-02  2.257e-01   0.401 0.688714    
## MFWEKIND6    3.157e-01  2.300e-01   1.372 0.169937    
## MFWEKIND7    2.626e-01  2.427e-01   1.082 0.279200    
## MFWEKIND8    6.435e-01  2.577e-01   2.497 0.012533 *  
## MFWEKIND9    5.518e-01  2.603e-01   2.120 0.034027 *  
## MOPLMIDD1    5.160e-01  1.268e-01   4.069 4.72e-05 ***
## MOPLMIDD2    4.192e-01  1.088e-01   3.855 0.000116 ***
## MOPLMIDD3    3.048e-01  1.055e-01   2.888 0.003874 ** 
## MOPLMIDD4    4.256e-01  1.050e-01   4.054 5.03e-05 ***
## MOPLMIDD5    4.330e-01  1.117e-01   3.876 0.000106 ***
## MOPLMIDD6    3.691e-01  1.320e-01   2.797 0.005155 ** 
## MOPLMIDD7    6.855e-01  1.619e-01   4.234 2.29e-05 ***
## MOPLMIDD8    4.890e-01  2.741e-01   1.784 0.074364 .  
## MOPLMIDD9    8.628e-01  2.561e-01   3.368 0.000756 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177  on 10947  degrees of freedom
## Residual deviance: 13767  on 10862  degrees of freedom
## AIC: 13939
## 
## Number of Fisher Scoring iterations: 14
drewMatrix(model.2, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2380  162
##          1 1382   76
##                                           
##                Accuracy : 0.614           
##                  95% CI : (0.5987, 0.6291)
##     No Information Rate : 0.9405          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0141         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.31933         
##             Specificity : 0.63264         
##          Pos Pred Value : 0.05213         
##          Neg Pred Value : 0.93627         
##              Prevalence : 0.05950         
##          Detection Rate : 0.01900         
##    Detection Prevalence : 0.36450         
##       Balanced Accuracy : 0.47598         
##                                           
##        'Positive' Class : 1               
## 
# difference in deviance =  Null deviance (15177) - 13808 = 1369

explanation

when removing MBERMIDD (middle management) from predictors Sensitivity jump from 45 to 51 while Accuracy is constant 0.62

Model 3 with Domain knowledge

corrplot(cor(subset(df_train , select = c("PBRAND", "MOSTYPE", "PPERSAUT", "MKOOPKLA", "MHKOOP", "CARAVAN"))), method = "number", type = "upper")

model.3 = glm(formula = CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA+MHKOOP, family = binomial(link = "logit"), 
    data = over_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model.3$xlevels[["PPERSAUT"]] <- union(model.3$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))

predicted_3 = predict(model.3, over_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
predictedClass_3 = ifelse(predicted_3>=0.5, 1, 0)

drewSummary(model.3)
## 
## Call:
## glm(formula = CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA + 
##     MHKOOP, family = binomial(link = "logit"), data = over_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2167  -0.8931   0.2117   0.9058   2.0486  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   14.47544  541.16916   0.027 0.978660    
## PBRAND1       -0.49043    0.18776  -2.612 0.009002 ** 
## PBRAND2       -1.03340    0.12524  -8.251  < 2e-16 ***
## PBRAND3        0.69376    0.06388  10.860  < 2e-16 ***
## PBRAND4        0.76574    0.05608  13.654  < 2e-16 ***
## PBRAND5        0.08495    0.13987   0.607 0.543606    
## PBRAND6       -0.65475    0.18569  -3.526 0.000422 ***
## PBRAND7      -16.12863  661.61420  -0.024 0.980551    
## PBRAND8      -16.62515 2399.54472  -0.007 0.994472    
## MOSTYPE2      -2.15601    0.49860  -4.324 1.53e-05 ***
## MOSTYPE3      -1.92075    0.47355  -4.056 4.99e-05 ***
## MOSTYPE4      -2.68118    0.53154  -5.044 4.56e-07 ***
## MOSTYPE5      -0.59567    1.11153  -0.536 0.592031    
## MOSTYPE6      -0.23614    0.18596  -1.270 0.204125    
## MOSTYPE7      -2.02196    0.51227  -3.947 7.91e-05 ***
## MOSTYPE8      -0.28159    0.34959  -0.805 0.420536    
## MOSTYPE9      -1.47723    1.05150  -1.405 0.160058    
## MOSTYPE10     -0.60305    0.18771  -3.213 0.001316 ** 
## MOSTYPE11     -2.23698    0.46451  -4.816 1.47e-06 ***
## MOSTYPE12     -0.27661    0.34960  -0.791 0.428817    
## MOSTYPE13     -1.73295    0.47191  -3.672 0.000240 ***
## MOSTYPE15    -30.93764 1156.29826  -0.027 0.978655    
## MOSTYPE16    -31.85353  745.27915  -0.043 0.965908    
## MOSTYPE17    -17.68629  723.95163  -0.024 0.980509    
## MOSTYPE18    -31.56943  727.42727  -0.043 0.965384    
## MOSTYPE19    -17.14510 1259.35956  -0.014 0.989138    
## MOSTYPE20    -15.72297  541.16893  -0.029 0.976822    
## MOSTYPE21    -31.62526  784.41773  -0.040 0.967841    
## MOSTYPE22    -16.28268  541.16888  -0.030 0.975997    
## MOSTYPE23     -0.87605    1.08504  -0.807 0.419444    
## MOSTYPE24    -16.42394  541.16891  -0.030 0.975789    
## MOSTYPE25    -16.04630  541.16917  -0.030 0.976345    
## MOSTYPE26    -16.51090  541.16926  -0.031 0.975661    
## MOSTYPE27    -15.77384  541.16924  -0.029 0.976747    
## MOSTYPE28    -31.53412  694.45273  -0.045 0.963782    
## MOSTYPE29     -1.37372    1.10094  -1.248 0.212113    
## MOSTYPE30    -16.93557  541.16892  -0.031 0.975035    
## MOSTYPE31    -16.43894  541.16918  -0.030 0.975767    
## MOSTYPE32    -15.30423  541.16917  -0.028 0.977439    
## MOSTYPE33     -0.46526    1.07886  -0.431 0.666282    
## MOSTYPE34     -2.60424    0.48346  -5.387 7.18e-08 ***
## MOSTYPE35     -2.22303    1.01694  -2.186 0.028815 *  
## MOSTYPE36      0.13243    1.08345   0.122 0.902715    
## MOSTYPE37     -0.84203    1.04883  -0.803 0.422074    
## MOSTYPE38     -0.86215    1.04810  -0.823 0.410746    
## MOSTYPE39     -1.74458    1.01726  -1.715 0.086347 .  
## MOSTYPE40    -16.66488  253.34626  -0.066 0.947554    
## MOSTYPE41     -1.99373    1.05542  -1.889 0.058885 .  
## PPERSAUT4    -15.14102 2399.54472  -0.006 0.994965    
## PPERSAUT5     -0.02331    0.09145  -0.255 0.798806    
## PPERSAUT6      1.54736    0.04968  31.149  < 2e-16 ***
## PPERSAUT7    -15.57694  332.95364  -0.047 0.962685    
## PPERSAUT8    -15.09995 1145.06773  -0.013 0.989479    
## MKOOPKLA2      0.42888    0.53644   0.799 0.424009    
## MKOOPKLA3    -15.48158  541.16810  -0.029 0.977177    
## MKOOPKLA4    -14.46865  541.16815  -0.027 0.978670    
## MKOOPKLA5    -13.99046  541.16821  -0.026 0.979375    
## MKOOPKLA6    -13.54198  541.16896  -0.025 0.980036    
## MKOOPKLA7    -14.67534  541.16904  -0.027 0.978366    
## MKOOPKLA8    -15.42718  541.16914  -0.029 0.977258    
## MHKOOP1       -0.28509    0.11407  -2.499 0.012448 *  
## MHKOOP2       -0.30769    0.11538  -2.667 0.007659 ** 
## MHKOOP3        0.20704    0.11531   1.795 0.072578 .  
## MHKOOP4        0.05319    0.10592   0.502 0.615543    
## MHKOOP5       -0.19963    0.10595  -1.884 0.059549 .  
## MHKOOP6        0.29310    0.10168   2.883 0.003942 ** 
## MHKOOP7       -0.01303    0.09947  -0.131 0.895790    
## MHKOOP8        0.53559    0.10992   4.872 1.10e-06 ***
## MHKOOP9        0.24573    0.09717   2.529 0.011446 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177  on 10947  degrees of freedom
## Residual deviance: 12039  on 10879  degrees of freedom
## AIC: 12177
## 
## Number of Fisher Scoring iterations: 15
drewMatrix(model.3, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2247   70
##          1 1515  168
##                                          
##                Accuracy : 0.6038         
##                  95% CI : (0.5884, 0.619)
##     No Information Rate : 0.9405         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0789         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.70588        
##             Specificity : 0.59729        
##          Pos Pred Value : 0.09982        
##          Neg Pred Value : 0.96979        
##              Prevalence : 0.05950        
##          Detection Rate : 0.04200        
##    Detection Prevalence : 0.42075        
##       Balanced Accuracy : 0.65159        
##                                          
##        'Positive' Class : 1              
## 
# Area under curve
pr <- prediction(predictedClass_3,over_test$CARAVAN)
perf <- performance(pr,measure = "tpr",x.measure = "fpr")
plot(perf) > auc(over_test$CARAVAN,predictedClass_3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

## logical(0)
auc_ROCR <- performance(pr, measure = "auc")
auc_ROCR <- auc_ROCR@y.values[[1]]
auc_ROCR
## [1] 0.6515855
pR2(model.3)['McFadden']
## fitting null model for pseudo-r2
##  McFadden 
## 0.2067468
accuracy(predictedClass_3, as.numeric(over_test$CARAVAN))
##               ME      RMSE     MAE    MPE   MAPE
## Test set 0.63875 0.8208228 0.63875 60.025 60.025
# difference in deviance =  Null deviance (15177) - 12152 = 3025

Explanation AUC roc plot logistic regression Our AUC score is 0.7176653

Evaluting performance of model 3

pred_t <- predict(model.3, na.action=na.pass)
hist(pred_t)

boxplot(pred_t)

##Plotting residual histograms for training and validation data
resid.t<-residuals(model.3)
hist(resid.t)

Roc

r <- roc(over_test$CARAVAN, predicted_3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(r)

Lift chart

lift.example <- lift(relevel(as.factor(over_test$CARAVAN), ref="1") ~ predicted_3, data = over_test)
#xyplot(lift.example, plot = "gain")

Decil Wise chart

library(gains)
actual = as.numeric(over_test$CARAVAN)
predicted_3_num = as.numeric(predicted_3)
gain = gains(actual, predicted_3_num)

barplot(gain$mean.resp / mean(actual), names.arg = gain$depth, xlab = "Percentile", ylab = "Mean Response", main = "Decile-wise lift chart")

Decision tree.

fit1 <- rpart(formula=CARAVAN ~ .,data=over_train,control=rpart.control(minsplit=20, minbucket=1, cp=0.008))

fit1
## n= 10948 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##   1) root 10948 5474 0 (0.50000000 0.50000000)  
##     2) PPERSAUT=0,4,5,7,8 4770 1353 0 (0.71635220 0.28364780)  
##       4) MOSTYPE=3,4,6,9,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,33,34,35,40,41 2376  322 0 (0.86447811 0.13552189) *
##       5) MOSTYPE=1,2,5,7,8,10,11,12,13,32,36,37,38,39 2394 1031 0 (0.56934002 0.43065998)  
##        10) PBRAND=2,5,6,7,8 177    0 0 (1.00000000 0.00000000) *
##        11) PBRAND=0,1,3,4 2217 1031 0 (0.53495715 0.46504285)  
##          22) MINKM30=3,4,7,8,9 404   85 0 (0.78960396 0.21039604) *
##          23) MINKM30=0,1,2,5,6 1813  867 1 (0.47821291 0.52178709)  
##            46) MSKC=0,2,3,7 794  305 0 (0.61586902 0.38413098)  
##              92) MINK7512=0,2,5,6,9 402   83 0 (0.79353234 0.20646766) *
##              93) MINK7512=1,3,4 392  170 1 (0.43367347 0.56632653)  
##               186) MOSTYPE=1,7,10,11,12,32,37 60    0 0 (1.00000000 0.00000000) *
##               187) MOSTYPE=5,8,13,36,38,39 332  110 1 (0.33132530 0.66867470) *
##            47) MSKC=1,4,5,6,8,9 1019  378 1 (0.37095191 0.62904809)  
##              94) PBRAND=0,1,3 672  314 1 (0.46726190 0.53273810)  
##               188) MOSTYPE=5,10,11,12,39 122   13 0 (0.89344262 0.10655738) *
##               189) MOSTYPE=1,2,7,8,13,32,36,37,38 550  205 1 (0.37272727 0.62727273)  
##                 378) MHHUUR=4,5,6,8,9 152   50 0 (0.67105263 0.32894737) *
##                 379) MHHUUR=0,1,2,3,7 398  103 1 (0.25879397 0.74120603) *
##              95) PBRAND=4 347   64 1 (0.18443804 0.81556196) *
##     3) PPERSAUT=6 6178 2057 1 (0.33295565 0.66704435)  
##       6) PBRAND=0,1,2,6,7 2383 1131 1 (0.47461183 0.52538817)  
##        12) MOSTYPE=5,7,9,16,17,18,19,21,22,23,26,28,29,30,31,32,40,41 458   80 0 (0.82532751 0.17467249) *
##        13) MOSTYPE=1,2,3,4,6,8,10,11,12,13,20,24,25,27,33,34,35,36,37,38,39 1925  753 1 (0.39116883 0.60883117)  
##          26) MBERHOOG=0,5 432  171 0 (0.60416667 0.39583333)  
##            52) MOSTYPE=2,3,4,6,10,11,13,20,25,27,33,34,37,39 195   15 0 (0.92307692 0.07692308) *
##            53) MOSTYPE=8,12,24,35,36,38 237   81 1 (0.34177215 0.65822785) *
##          27) MBERHOOG=1,2,3,4,6,7,8,9 1493  492 1 (0.32953784 0.67046216)  
##            54) MINKGEM=0,1,3,9 251   98 0 (0.60956175 0.39043825)  
##             108) MHKOOP=0,1,2,3,5,6,8,9 115    0 0 (1.00000000 0.00000000) *
##             109) MHKOOP=4,7 136   38 1 (0.27941176 0.72058824) *
##            55) MINKGEM=2,4,5,6,7,8 1242  339 1 (0.27294686 0.72705314) *
##       7) PBRAND=3,4,5 3795  926 1 (0.24400527 0.75599473) *
glm_6 = glm(formula = CARAVAN ~  PPERSAUT+MOSTYPE+PBRAND+MBERHOOG+MBERMIDD, family = binomial(link = "logit"),
    data = over_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_6$xlevels[["PPERSAUT"]] <- union(glm_6$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))
summary(glm_6)
## 
## Call:
## glm(formula = CARAVAN ~ PPERSAUT + MOSTYPE + PBRAND + MBERHOOG + 
##     MBERMIDD, family = binomial(link = "logit"), data = over_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3155  -0.9003   0.1773   0.8854   2.1747  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.34361    0.18569  -7.236 4.63e-13 ***
## PPERSAUT4    -15.27497 2399.54472  -0.006 0.994921    
## PPERSAUT5     -0.00800    0.09172  -0.087 0.930494    
## PPERSAUT6      1.52601    0.04975  30.672  < 2e-16 ***
## PPERSAUT7    -15.60973  332.65955  -0.047 0.962574    
## PPERSAUT8    -14.85777 1124.93891  -0.013 0.989462    
## MOSTYPE2      -0.29771    0.23434  -1.270 0.203938    
## MOSTYPE3      -0.18950    0.17141  -1.106 0.268923    
## MOSTYPE4      -0.86524    0.29107  -2.973 0.002952 ** 
## MOSTYPE5      -0.65396    0.31672  -2.065 0.038941 *  
## MOSTYPE6      -0.18984    0.19068  -0.996 0.319453    
## MOSTYPE7      -0.27473    0.29135  -0.943 0.345703    
## MOSTYPE8       0.45296    0.16787   2.698 0.006969 ** 
## MOSTYPE9      -0.69670    0.18987  -3.669 0.000243 ***
## MOSTYPE10     -0.48945    0.19884  -2.461 0.013836 *  
## MOSTYPE11     -0.36872    0.19783  -1.864 0.062351 .  
## MOSTYPE12      0.55440    0.20352   2.724 0.006448 ** 
## MOSTYPE13     -0.12444    0.18549  -0.671 0.502292    
## MOSTYPE15    -15.69382 1022.90291  -0.015 0.987759    
## MOSTYPE16    -16.45638  521.76132  -0.032 0.974839    
## MOSTYPE17    -16.39451  732.21944  -0.022 0.982137    
## MOSTYPE18    -16.10746  505.67583  -0.032 0.974589    
## MOSTYPE19    -16.72381 1316.24883  -0.013 0.989863    
## MOSTYPE20      0.11840    0.34379   0.344 0.730553    
## MOSTYPE21    -16.02262  580.47407  -0.028 0.977979    
## MOSTYPE22     -0.40175    0.24052  -1.670 0.094844 .  
## MOSTYPE23     -0.97333    0.21252  -4.580 4.65e-06 ***
## MOSTYPE24     -0.60393    0.21915  -2.756 0.005857 ** 
## MOSTYPE25     -0.76852    0.28075  -2.737 0.006194 ** 
## MOSTYPE26     -0.99809    0.36573  -2.729 0.006353 ** 
## MOSTYPE27     -0.57514    0.33182  -1.733 0.083040 .  
## MOSTYPE28    -16.11485  438.67563  -0.037 0.970696    
## MOSTYPE29     -1.26481    0.27093  -4.668 3.04e-06 ***
## MOSTYPE30     -0.96621    0.24120  -4.006 6.18e-05 ***
## MOSTYPE31     -0.86678    0.21375  -4.055 5.01e-05 ***
## MOSTYPE32      0.24365    0.21148   1.152 0.249271    
## MOSTYPE33     -0.25391    0.16529  -1.536 0.124497    
## MOSTYPE34     -0.64158    0.20226  -3.172 0.001514 ** 
## MOSTYPE35     -0.67526    0.19043  -3.546 0.000391 ***
## MOSTYPE36      0.19205    0.18397   1.044 0.296514    
## MOSTYPE37      0.19145    0.20323   0.942 0.346177    
## MOSTYPE38      0.13589    0.17904   0.759 0.447866    
## MOSTYPE39     -0.14607    0.18039  -0.810 0.418105    
## MOSTYPE40    -16.41800  251.57322  -0.065 0.947966    
## MOSTYPE41     -0.79557    0.21133  -3.765 0.000167 ***
## PBRAND1       -0.43791    0.19018  -2.303 0.021296 *  
## PBRAND2       -1.04821    0.12516  -8.375  < 2e-16 ***
## PBRAND3        0.64871    0.06351  10.214  < 2e-16 ***
## PBRAND4        0.81945    0.05612  14.601  < 2e-16 ***
## PBRAND5        0.33021    0.14071   2.347 0.018938 *  
## PBRAND6       -0.57105    0.18588  -3.072 0.002126 ** 
## PBRAND7      -15.99307  665.07867  -0.024 0.980815    
## PBRAND8      -16.37908 2399.54473  -0.007 0.994554    
## MBERHOOG1      0.22021    0.07335   3.002 0.002680 ** 
## MBERHOOG2      0.31208    0.07077   4.410 1.04e-05 ***
## MBERHOOG3      0.22234    0.08192   2.714 0.006641 ** 
## MBERHOOG4      0.67134    0.10102   6.646 3.02e-11 ***
## MBERHOOG5     -0.04999    0.13366  -0.374 0.708401    
## MBERHOOG6      0.95395    0.14942   6.384 1.72e-10 ***
## MBERHOOG7      1.17888    0.17718   6.654 2.86e-11 ***
## MBERHOOG8      0.39174    0.32089   1.221 0.222168    
## MBERHOOG9      0.02271    0.32018   0.071 0.943459    
## MBERMIDD1     -0.05638    0.12197  -0.462 0.643908    
## MBERMIDD2     -0.02386    0.09190  -0.260 0.795146    
## MBERMIDD3      0.09312    0.09578   0.972 0.330930    
## MBERMIDD4      0.38201    0.10114   3.777 0.000159 ***
## MBERMIDD5      0.40873    0.11613   3.520 0.000432 ***
## MBERMIDD6      0.48162    0.14190   3.394 0.000688 ***
## MBERMIDD7      0.31078    0.15090   2.060 0.039440 *  
## MBERMIDD8    -16.57883  573.49310  -0.029 0.976938    
## MBERMIDD9      0.49245    0.19778   2.490 0.012779 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 15177  on 10947  degrees of freedom
## Residual deviance: 12017  on 10877  degrees of freedom
## AIC: 12159
## 
## Number of Fisher Scoring iterations: 15
predicted_6 = predict(glm_6, over_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
predictedClass_6 = ifelse(predicted_6>=0.5, 1, 0)


confusionMatrix(as.factor(predictedClass_6), as.factor(over_test$CARAVAN), positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2687  132
##          1 1075  106
##                                           
##                Accuracy : 0.6982          
##                  95% CI : (0.6838, 0.7125)
##     No Information Rate : 0.9405          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0559          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.44538         
##             Specificity : 0.71425         
##          Pos Pred Value : 0.08975         
##          Neg Pred Value : 0.95317         
##              Prevalence : 0.05950         
##          Detection Rate : 0.02650         
##    Detection Prevalence : 0.29525         
##       Balanced Accuracy : 0.57981         
##                                           
##        'Positive' Class : 1               
## 
accuracy(predictedClass_6, as.numeric(over_test$CARAVAN))
##               ME      RMSE     MAE  MPE MAPE
## Test set 0.76425 0.9111806 0.76425 71.8 71.8
pR2(glm_6)['McFadden']
## fitting null model for pseudo-r2
##  McFadden 
## 0.2081893
# difference in deviance =  Null deviance (15177) - 12069 = 3108

Model comparsion using anova test

anova(logit.reg,model.2,model.3,glm_6,test = 'Chisq')
## Analysis of Deviance Table
## 
## Model 1: CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c + MGEMLEEF_c
## Model 2: CARAVAN ~ MAANTHUI + MGEMOMV + MGEMLEEF + MOSHOOFD + MGODRK + 
##     MGODPR + MGODOV + MGODGE + MRELGE + MRELSA + MFWEKIND + MOPLMIDD
## Model 3: CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA + MHKOOP
## Model 4: CARAVAN ~ PPERSAUT + MOSTYPE + PBRAND + MBERHOOG + MBERMIDD
##   Resid. Df Resid. Dev  Df Deviance  Pr(>Chi)    
## 1     10931      14399                           
## 2     10862      13767  69   631.50 < 2.2e-16 ***
## 3     10879      12039 -17  1727.76              
## 4     10877      12017   2    21.89 1.762e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1